Here using the protein-coding gene subset from previous data processing to conduct differential expression analysis, and also build the main pipeline.
dir <- "/home/ubuntu/storage/tcga"
pcg_txi <- readRDS(file.path(dir,"txi.split/pcg.txi.rds"),refhook = NULL)
new_cdr <- read.csv(file.path(dir,"clinic/new.filt.cdr.csv"), row.names = 1)
#sample_info <- read.delim(file.path(dir,"clinic/sample.info.tsv"), header=T)
cfa <- pcg_txi$countsFromAbundance
Here 5 samples which had extremely low library size were filted out, so the sample size would go down from 6125 to 6120.
counts <- sort(colSums(cfa))
counts[1:10]
## TCGA-YG-AA3N-01A-11R-A38C-07 TCGA-EB-A97M-01A-11R-A38C-07
## 245766.7 314400.9
## TCGA-ER-A2NB-01A-12R-A18S-07 TCGA-BF-AAOX-01A-11R-A39D-07
## 328614.4 398275.3
## TCGA-XV-A9VZ-01A-11R-A38C-07 TCGA-BL-A13I-01A-11R-A277-07
## 471810.6 2922073.8
## TCGA-56-A49D-01A-11R-A24H-07 TCGA-CJ-4642-01B-01R-1305-07
## 3502730.1 6990537.2
## TCGA-14-0871-01A-01R-1849-01 TCGA-44-2665-01B-06R-A277-07
## 8030096.4 8060743.1
rm_id <- names(counts[1:5])
cfa <- as.data.frame(cfa)[, -which(colnames(cfa) %in% rm_id)]
cfa <- data.matrix(cfa)
new_cdr <- new_cdr[-which(rownames(new_cdr) %in% rm_id),]
dim(cfa)
## [1] 20449 6120
dim(new_cdr)
## [1] 6120 35
# Create dge object
dge <- DGEList(cfa)
# Filtering - remove rows that consistently have zero or very low counts
keep <- filterByExpr(dge)
table(keep) # remove 4873 genes
## keep
## FALSE TRUE
## 4870 15579
dge_filt <- dge[keep, ]
# Draw density plot
lcpm <- cpm(dge, log=T)
lcpm_f <- cpm(dge_filt, log=T)
nsamples <- ncol(dge)
dp_col <- get_colVector(nsamples)
par(mfrow=c(1,2), las=1)
draw_densityPlot(lcpm, dp_col, nsamples)
## integer(0)
draw_densityPlot(lcpm_f, dp_col, nsamples)
## integer(0)
# Apply TMM normalization method (optional...)
dge_filt <- calcNormFactors(dge_filt)
# Draw boxplots to see the effect, using 20 samples.
col_box <- get_colVector(20)
boxplot(lcpm_f[,1:20], las=2, col=col_box, main="log-CPM before normalization")
lcpm_f <- cpm(dge_filt, log=T)
boxplot(lcpm_f[,1:20], las=2, col=col_box, main="log-CPM after TMM normalization")
fData <- read.delim(file.path(dir, "annotation/fData.txt"), header = T, row.names = 1)
geneid <- rownames(dge_filt$counts)
anno <- fData[geneid,]
gl <- as.vector(anno$geneNames)
meta <- "Metastasisyes"
design1 <- model.matrix(~ Metastasis, data = new_cdr)
head(design1, 5)
## (Intercept) Metastasisyes
## TCGA-2Z-A9J3-01A-12R-A38C-07 1 0
## TCGA-EJ-7784-01A-11R-2118-07 1 0
## TCGA-MJ-A850-01A-11R-A355-07 1 0
## TCGA-ZG-A9LM-01A-11R-A41O-07 1 0
## TCGA-BW-A5NQ-01A-11R-A27V-07 1 0
The first coefficient estimates the mean log-expression for non-metastasized samples and play the role of an intercept. The second one estimates the difference between meta v.s. non-meta. This approach is equal to (~ 0 + Metastasis) then use contrast matrix [confirmed].
v1 <- voom(dge_filt, design1, plot = T)
fit1 <- lmFit(v1, design1)
fit1 <- eBayes(fit1)
tt1 <- topTable(fit1, number=Inf, genelist=gl, coef=meta)
head(tt1, 20)
## ID logFC AveExpr t P.Value
## ENSG00000138778 CENPE 1.1298105 3.008218 16.14582 1.844244e-57
## ENSG00000163993 S100P 2.3508990 2.053039 16.09008 4.380619e-57
## ENSG00000066279 ASPM 1.3395268 3.473725 16.01242 1.455696e-56
## ENSG00000126787 DLGAP5 1.3016569 2.879460 15.81695 2.920507e-55
## ENSG00000178999 AURKB 1.2331805 2.920284 15.80744 3.376327e-55
## ENSG00000112984 KIF20A 1.1832786 3.512663 15.71442 1.389282e-54
## ENSG00000186185 KIF18B 1.1530582 3.072270 15.71481 1.381099e-54
## ENSG00000183856 IQGAP3 1.2645420 3.905030 15.69480 1.870390e-54
## ENSG00000156970 BUB1B 0.9645109 3.413031 15.59714 8.176882e-54
## ENSG00000143228 NUF2 1.1746111 2.642412 15.58172 1.031356e-53
## ENSG00000142945 KIF2C 1.1744055 3.689352 15.39073 1.797319e-52
## ENSG00000175063 UBE2C 1.4345441 4.016835 15.34495 3.548526e-52
## ENSG00000138180 CEP55 1.2631996 3.326489 15.34041 3.795937e-52
## ENSG00000024526 DEPDC1 1.2634682 2.397007 15.16748 4.871793e-51
## ENSG00000075218 GTSE1 0.9520425 2.940667 15.11150 1.106663e-50
## ENSG00000169679 BUB1 1.1069413 3.600840 15.08936 1.529573e-50
## ENSG00000129173 E2F8 1.2971572 1.624598 15.01373 4.606822e-50
## ENSG00000166851 PLK1 1.1938695 3.994129 14.94652 1.221942e-49
## ENSG00000072571 HMMR 1.0366860 2.869561 14.94484 1.252001e-49
## ENSG00000171241 SHCBP1 0.9357017 2.749364 14.92614 1.641161e-49
## adj.P.Val B
## ENSG00000138778 2.873147e-53 119.7610
## ENSG00000163993 3.412283e-53 118.9046
## ENSG00000066279 7.559429e-53 117.7372
## ENSG00000126787 1.051996e-51 114.7435
## ENSG00000178999 1.051996e-51 114.5985
## ENSG00000112984 3.091947e-51 113.2152
## ENSG00000186185 3.091947e-51 113.2077
## ENSG00000183856 3.642350e-51 112.9203
## ENSG00000156970 1.415418e-50 111.4529
## ENSG00000143228 1.606749e-50 111.1865
## ENSG00000142945 2.545494e-49 108.3932
## ENSG00000175063 4.548992e-49 107.7130
## ENSG00000138180 4.548992e-49 107.6514
## ENSG00000024526 5.421262e-48 105.0740
## ENSG00000075218 1.149381e-47 104.2878
## ENSG00000169679 1.489326e-47 103.9872
## ENSG00000129173 4.221746e-47 102.7233
## ENSG00000166851 1.026575e-46 101.9209
## ENSG00000072571 1.026575e-46 101.8864
## ENSG00000171241 1.278382e-46 101.6052
volp1 <- volcanoplot(fit1, coef=meta, highlight = 4, names=gl)
map1 <- plotMA(fit1, main = "did not correct for cancer type")
summary(decideTests(fit1)) # 5309 down; 4652 up
## (Intercept) Metastasisyes
## Down 1181 4843
## NotSig 95 6105
## Up 14303 4631
draw_SigCounts(tt1)
plot(density(tt1$logFC), col="skyblue", lwd=3, main="logFC density (design 1)",xlab="logFC")
geneList <- rownames(tt1[1:3,])
draw_boxplot(dge_filt$counts,tt1,new_cdr,geneList[1])
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
draw_boxplot(dge_filt$counts,tt1,new_cdr,geneList[2])
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
draw_boxplot(dge_filt$counts,tt1,new_cdr,geneList[3])
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
Draw the heatmap for the top 50 DEGs from limma-voom results of metastasis v.s. non-metastasis comparison only.
select_gene <- rownames(tt1[(tt1$adj.P.Val < 0.05 & abs(tt1$logFC) > log2(2)),])
h1 <- draw_heatmap(v1, tt1, new_cdr, anno, select_gene[1:50])
design2 <- model.matrix(~ type + Metastasis, data = new_cdr)
head(design2,3)
## (Intercept) typeBRCA typeCESC typeCHOL
## TCGA-2Z-A9J3-01A-12R-A38C-07 1 0 0 0
## TCGA-EJ-7784-01A-11R-2118-07 1 0 0 0
## TCGA-MJ-A850-01A-11R-A355-07 1 0 0 0
## typeCOAD typeESCA typeGBM typeHNSC typeKICH
## TCGA-2Z-A9J3-01A-12R-A38C-07 0 0 0 0 0
## TCGA-EJ-7784-01A-11R-2118-07 0 0 0 0 0
## TCGA-MJ-A850-01A-11R-A355-07 0 0 0 0 0
## typeKIRC typeKIRP typeLGG typeLIHC typeLUAD
## TCGA-2Z-A9J3-01A-12R-A38C-07 0 1 0 0 0
## TCGA-EJ-7784-01A-11R-2118-07 0 0 0 0 0
## TCGA-MJ-A850-01A-11R-A355-07 0 0 0 0 0
## typeLUSC typeMESO typeOV typePAAD typePCPG
## TCGA-2Z-A9J3-01A-12R-A38C-07 0 0 0 0 0
## TCGA-EJ-7784-01A-11R-2118-07 0 0 0 0 0
## TCGA-MJ-A850-01A-11R-A355-07 0 0 0 0 0
## typePRAD typeREAD typeSARC typeSKCM typeSTAD
## TCGA-2Z-A9J3-01A-12R-A38C-07 0 0 0 0 0
## TCGA-EJ-7784-01A-11R-2118-07 1 0 0 0 0
## TCGA-MJ-A850-01A-11R-A355-07 0 0 1 0 0
## typeTGCT typeTHCA typeTHYM typeUCEC typeUCS
## TCGA-2Z-A9J3-01A-12R-A38C-07 0 0 0 0 0
## TCGA-EJ-7784-01A-11R-2118-07 0 0 0 0 0
## TCGA-MJ-A850-01A-11R-A355-07 0 0 0 0 0
## Metastasisyes
## TCGA-2Z-A9J3-01A-12R-A38C-07 0
## TCGA-EJ-7784-01A-11R-2118-07 0
## TCGA-MJ-A850-01A-11R-A355-07 0
v2 <- voom(dge_filt, design2, plot = T)
fit2 <- lmFit(v2, design2)
fit2 <- eBayes(fit2)
tt2 <- topTable(fit2, number=Inf,genelist = gl, coef=meta)
head(tt2, 20)
## ID logFC AveExpr t P.Value
## ENSG00000159217 IGF2BP1 0.3483215 0.84838389 5.697570 1.272091e-08
## ENSG00000170689 HOXB9 0.6554478 -0.11063298 5.555780 2.880689e-08
## ENSG00000040275 SPDL1 0.1630758 3.41878236 5.460150 4.945289e-08
## ENSG00000121552 CSTA -0.5232659 2.36487379 -5.365425 8.373709e-08
## ENSG00000115525 ST3GAL5 -0.2853079 4.03356596 -5.222934 1.819527e-07
## ENSG00000163815 CLEC3B -0.4002104 2.81816410 -4.864235 1.177866e-06
## ENSG00000160949 TONSL 0.1888062 4.24421099 4.794452 1.669946e-06
## ENSG00000117724 CENPF 0.2398036 5.07582928 4.755581 2.024301e-06
## ENSG00000050405 LIMA1 -0.1872162 6.63859912 -4.693093 2.749874e-06
## ENSG00000183145 RIPPLY3 0.3281734 0.04946417 4.753670 2.043468e-06
## ENSG00000160957 RECQL4 0.2257792 4.20401436 4.658038 3.260159e-06
## ENSG00000088305 DNMT3B 0.2390571 2.35754636 4.602601 4.257072e-06
## ENSG00000198901 PRC1 0.1902078 4.93012754 4.579998 4.742306e-06
## ENSG00000156970 BUB1B 0.1905644 3.41303079 4.551102 5.440164e-06
## ENSG00000183347 GBP6 -0.4282269 0.28070807 -4.583062 4.673547e-06
## ENSG00000112984 KIF20A 0.2216134 3.51266330 4.475419 7.764904e-06
## ENSG00000179163 FUCA1 -0.1765665 5.75068659 -4.476753 7.716744e-06
## ENSG00000183150 GPR19 0.2916359 -0.22194358 4.558829 5.244458e-06
## ENSG00000139880 CDH24 0.1930271 3.32901648 4.444816 8.952587e-06
## ENSG00000198768 APCDD1L 0.5021947 -0.80632787 4.499216 6.947104e-06
## adj.P.Val B
## ENSG00000159217 0.0001981790 8.606308
## ENSG00000170689 0.0002243913 8.096309
## ENSG00000040275 0.0002568089 8.028858
## ENSG00000121552 0.0003261350 7.480696
## ENSG00000115525 0.0005669282 6.822465
## ENSG00000163815 0.0030583294 4.989670
## ENSG00000160949 0.0035372434 4.807785
## ENSG00000117724 0.0035372434 4.615470
## ENSG00000050405 0.0042840291 4.322353
## ENSG00000183145 0.0035372434 4.234120
## ENSG00000160957 0.0046172748 4.190014
## ENSG00000088305 0.0052771703 3.906270
## ENSG00000198901 0.0052771703 3.829545
## ENSG00000156970 0.0052970200 3.728153
## ENSG00000183347 0.0052771703 3.697699
## ENSG00000112984 0.0060743694 3.396772
## ENSG00000179163 0.0060743694 3.377292
## ENSG00000183150 0.0052970200 3.349265
## ENSG00000139880 0.0066415405 3.275995
## ENSG00000198768 0.0060743694 3.159289
volp2 <- volcanoplot(fit2, coef=meta, highlight = 6, names=gl)
map2 <- plotMA(fit2, main = "correct for cancer type")
draw_SigCounts(tt2)
plot(density(tt2$logFC), col="steelblue", lwd=3, main="logFC density (design 2)",xlab="logFC")
geneList2 <- rownames(tt2[1:3,])
draw_boxplot(dge_filt$counts,tt2,new_cdr,geneList2[1])
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
draw_boxplot(dge_filt$counts,tt2,new_cdr,geneList2[2])
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
draw_boxplot(dge_filt$counts,tt2,new_cdr,geneList2[3])
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
## Warning: Computation failed in `stat_compare_means()`:
## argument "x" is missing, with no default
select_gene2 <- rownames(tt2[(tt2$adj.P.Val < 0.05 & abs(tt2$logFC) > log2(1.2)),])
h2 <- draw_heatmap(v2, tt2, new_cdr, anno, select_gene2)
gene_d1 <- rownames(tt1[tt1$adj.P.Val < 0.05,])
gene_d2 <- rownames(tt2[tt2$adj.P.Val < 0.05,])
ven1 <- venn(list(design1=gene_d1, design2=gene_d2), intersections = T)
isect <- attr(ven1, "intersection")
d2_only <- isect$design2
anno[d2_only,]$geneNames
## [1] LIMA1 CLP1 KLF2 PIANP APOBEC3A CCDC69 RAB38
## [8] IL1B LACTB DENND2C RNFT2 CDC42 GTF2B PFN2
## [15] SCN8A ENO2 SLCO1A2 PCDHB10 FAM46C SMPD3 CYP4B1
## [22] C15orf52 STX11 ZFP41 PHC1 ALOX5AP UPF3A ZNF517
## [29] FAM162B ZC2HC1A F13A1 SLC30A3 RHOA TMEFF1 KIF5A
## [36] PIK3CG CLEC10A KLRD1 MGAT5B B4GALNT1 CA3 PPP2R2A
## [43] C1orf21 FZD8 DNAH11 VAMP3 SBK1 ECHDC1 ZNF250
## [50] SH3BP5L SPECC1 SRGN RNF182 GDE1
## 56638 Levels: 5S_rRNA 7SK A1BG A1BG-AS1 A1CF A2M A2M-AS1 A2ML1 ... ZZZ3
tt1[d2_only,]$adj.P.Val
## [1] 0.30338066 0.19314615 0.75624681 0.60617291 0.18927227 0.21836588
## [7] 0.28809582 0.70717509 0.06290863 0.05712739 0.10336526 0.05427154
## [13] 0.58058122 0.57221630 0.44297623 0.83563621 0.67435176 0.77201654
## [19] 0.92883812 0.23529296 0.16387922 0.08044839 0.98393208 0.11715261
## [25] 0.34316408 0.12469296 0.56774195 0.22240135 0.16469080 0.61224165
## [31] 0.92359692 0.09247493 0.15267000 0.76679880 0.16471955 0.11404533
## [37] 0.70448115 0.93016051 0.07557638 0.07940169 0.14947612 0.16328159
## [43] 0.31679835 0.93408102 0.10672770 0.22666582 0.88481008 0.50200915
## [49] 0.46268151 0.41561274 0.12954772 0.15073853 0.05838829 0.70965129
tt2[d2_only,]$adj.P.Val
## [1] 0.004284029 0.007486101 0.007486101 0.007486101 0.007495182
## [6] 0.010675393 0.011557523 0.011557523 0.012230752 0.012645254
## [11] 0.013799841 0.013211830 0.013940479 0.013940479 0.013940479
## [16] 0.015462693 0.014556343 0.015756992 0.016977760 0.019050675
## [21] 0.019050675 0.020097219 0.020097219 0.021016541 0.021749304
## [26] 0.023570389 0.022648133 0.024064875 0.023570389 0.025687755
## [31] 0.025687755 0.025631322 0.026712462 0.028316918 0.031403939
## [36] 0.034072174 0.034487562 0.035582388 0.036964470 0.038123402
## [41] 0.036964470 0.036964470 0.036964470 0.038890980 0.038921748
## [46] 0.036964730 0.040674845 0.038123402 0.042913677 0.040026636
## [51] 0.043620728 0.044213687 0.048490775 0.047522907
w_dir <- "/home/ubuntu/storage/tcga/dea/whole"
write.table(tt1, file=file.path(w_dir,"table/all.types.d1.txt"), sep="\t", quote=F)
pdf(file.path(w_dir,"plot/all.d1.volcanoplot.pdf"))
volp1
## NULL
dev.off()
## png
## 2
pdf(file.path(w_dir,"plot/all.d1.maplot.pdf"))
map1
## NULL
dev.off()
## png
## 2
write.table(tt2, file=file.path(w_dir,"table/all.types.d2.txt"), sep="\t", quote=F)
pdf(file.path(w_dir,"table/all.d2.volcanoplot.pdf"))
volp2
## NULL
dev.off()
## png
## 2
pdf(file.path(w_dir,"plot/all.d2.maplot.pdf"))
map2
## NULL
dev.off()
## png
## 2
If the sequencing depth is reasonably consistent across the RNA samples. This approach will usually work well if the ratio of the largest library size to the smallest is not more than about 3-fold. It’s faster than limma-voom though using more memories.
## library size:
countsSum <- colSums(dge_filt$counts)
max(countsSum) / min(countsSum) # 55.29538
logCPM <- cpm(dge_filt, log=T, prior.count = 2.5)
fit_lt <- lmFit(logCPM, design1)
fit_lt <- eBayes(fit_lt, trend = T)
tt_lt <- topTable(fit_lt, number=Inf,genelist = gl, coef=meta)
head(tt_lt,20)
volcanoplot(fit_lt, coef=meta, highlight = 4, names=gl)
plotMA(fit_lt, main = "no correct for cancer type")